Compare core genome mapping method to 16S rRNA gene amplicon sequencing variants of Gardnerella
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.5 ✔ dplyr 1.0.7
## ✔ tidyr 1.1.4 ✔ stringr 1.4.0
## ✔ readr 2.0.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(phyloseq)
library(Biostrings)
library(formattable)
library(ggpubr)
library(cowplot)
`%!in%` <- negate(`%in%`)
repoDir <- ".."
dataDir <- "../../"
figureOut <- "../../FIGURES/submission_manuscript_figures"
#shotgun metadata
sgDF <- file.path(repoDir, "sumgsDF.tsv") %>%
read_tsv %>%
mutate(SampleID=as.character(SampleID),
SubjectID=as.character(SubjectID),
pctHuman=((Sickle_pairs-bbmapFiltered_pairs)/Sickle_pairs)*100) # calculate percent human reads
#ASVs from Callahan et al., 2017 PNAS Replication and Refinement
S16_path <- file.path(dataDir, "16S_data/RepRefine_Scripts/input/processed.rda")
load(S16_path)
#Gardnerella 16S fasta sequences from reference WGS strains
rRNAs16S <- file.path(dataDir, "gardPhylogeny/prokka_annotated_genomes/ffn") %>%
list.files(full.names = TRUE, pattern = "ffn") %>%
map(readDNAStringSet) %>%
do.call("c", .) %>%
.[str_detect(names(.), "16S ribosomal RNA")]
#gardnerella metadata
refDF <- file.path(repoDir, "refGardnerellaCladesGenomos.csv") %>%
read_csv
#Gardnerella clade abundance from cladeAssignments.Rmd
sgCladeDF <- file.path(repoDir, "gardCladeRelativeAbundance.tsv") %>%
read_tsv() %>%
mutate(SampleID=as.character(SampleID))
#Gardnerella genomospecies abundance from cladeAssignments.Rmd
sgGenomoDF <- file.path(repoDir, "gardGenomospeciesRelativeAbundance.tsv") %>%
read_tsv() %>%
mutate(SampleID=as.character(SampleID))
theme_set(theme_bw())
# variant colors for clades vs V4
ggplotCladeV4 <- list(scale_size_manual(values=c(1.5, 3)),
scale_color_manual(values=c("salmon", "indianred", "lightblue", "blue3", "darkorchid2", "yellowgreen", "red", "blue1", "grey48", "darkorchid2", "yellowgreen"), labels=c("Clade C1 (shotgun)", "Clade C2 (shotgun)", "Clade C3 (shotgun)", "Clade C4 (shotgun)", "Clade C5 (shotgun)", "Clade C6 (shotgun)", "G2: Clades 1/2 (V4 amplicon)", "G1: Clades 3/4 (V4 amplicon)", "G3: indeterminant (V4 amplicon)", "G4: Clade 5 (V4 amplicon)", "G5: Clade 6 (V4 amplicon)")),
scale_shape_manual(values = c(16, 1)))
# clade, genomospecies, and V4 variant colors
ggplotAll <- list(scale_size_manual(values=c(1.5, 2, 3)),
scale_color_manual(values=c("salmon", "indianred", "lightblue", "blue3", "darkorchid2", "yellowgreen",
"salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "blue4", "blue", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "darkorchid2", "yellowgreen", "cadetblue1",
"red", "blue1", "grey48", "darkorchid2", "yellowgreen"),
labels=c("Clade C1 (shotgun)", "Clade C2 (shotgun)", "Clade C3 (shotgun)", "Clade C4 (shotgun)", "Clade C5 (shotgun)", "Clade C6 (shotgun)",
"Genomospecies 1, G. vaginalis (shotgun)",
"Genomospecies 2 (shotgun)",
"Genomospecies 3 (shotgun)",
"Genomospecies 4, G. piotti (shotgun)",
"Genomospecies 11 (shotgun)",
"Genomospecies 5, G. leopoldii (shotgun)",
"Genomospecies 6, G. swidsinskii (shotgun)",
"Genomospecies 7 (shotgun)",
"Genomospecies 8 (shotgun)",
"Genomospecies 9 (shotgun)",
"Genomospecies 10 (shotgun)",
"Genomospecies 12 (shotgun)",
"Genomospecies 13 (shotgun)",
"Genomospecies 14 (shotgun)",
"G2: Clades 1/2 (V4 amplicon)", "G1: Clades 3/4 (V4 amplicon)", "G3: indetereminant (V4 amplicon)", "G4: Clade 5 (V4 amplicon)", "G5: Clade 6 (V4 amplicon)")),
scale_shape_manual(values = c(16, 5, 1)))
# clade and genomospecies variant colors
ggplotShotgunOnly <- list(scale_size_manual(values=c(1.5, 2)),
scale_color_manual(values=c("salmon", "indianred", "lightblue", "blue3", "darkorchid2", "yellowgreen",
"salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "blue4", "blue", "lightblue1", "lightblue2", "lightblue3", "lightblue4", "darkorchid2", "yellowgreen", "cadetblue1"),
labels=c("Clade C1", "Clade C2", "Clade C3", "Clade C4", "Clade C5", "Clade C6",
"Genomospecies 1, G. vaginalis",
"Genomospecies 2",
"Genomospecies 3",
"Genomospecies 4, G. piotti",
"Genomospecies 11",
"Genomospecies 5, G. leopoldii",
"Genomospecies 6, G. swidsinskii",
"Genomospecies 7",
"Genomospecies 8",
"Genomospecies 9",
"Genomospecies 10",
"Genomospecies 12",
"Genomospecies 13",
"Genomospecies 14")),
scale_shape_manual(values = c(16, 5)))
#metaphlan data
mDF0 <- file.path(dataDir, "/humann2/merged_tables/mergedMetaphlanAbundance.tsv") %>%
read_tsv() %>%
filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
select(Species, everything(), -ID) %>%
as.data.frame()
#make col and row names
samps <- colnames(mDF0[2:ncol(mDF0)]) %>%
str_extract("[0-9]{10}")
samps
## [1] "1000801248" "1000801318" "1000801368" "1001301158" "1001301248"
## [6] "1001301318" "1001801138" "1001801248" "1001801318" "1002701138"
## [11] "1002701158" "1002701278" "1003101118" "1003101188" "1003101278"
## [16] "1004001168" "1004001278" "1004001348" "1005601098" "1005601168"
## [21] "1005601348" "1050801308" "1050801318" "1050835178" "1050835218"
## [26] "1060435158" "1060435318" "1060435348" "1060801198" "1060801338"
## [31] "1060835148" "1061235118" "1061235188" "1061235278" "1061635088"
## [36] "1061635208" "1061635368" "1062101108" "1062101238" "1062101338"
## [41] "1062301128" "1062301198" "1062335178" "1062601228" "1062601258"
## [46] "1062601368" "1062701128" "1062701298" "1062735148" "1062735198"
## [51] "1062901138" "1062901218" "1062901318" "1900401198" "1900401288"
## [56] "1900401398" "1902401118" "1902401208" "1902401308" "2050501168"
## [61] "2050501298" "2050501348" "4002101198" "4002101298" "4002135018"
## [66] "4003235018" "4003235288" "4003235368" "4004235208" "4004235248"
## [71] "4004235268" "4004735328" "4004735368" "4004935158" "4004935268"
## [76] "4004935338" "4005935278" "4005935328" "4005935358" "4006635198"
## [81] "4006635258" "4006635278" "4006635338" "4006935208" "4006935248"
## [86] "4006935358" "4007135108" "4007135118" "4007135148" "4007135288"
## [91] "4007235168" "4007235278" "4007435168" "4007435248" "4007435258"
## [96] "4007535198" "4007535238" "4007535358" "4008435158" "4008435348"
## [101] "4008435358" "4009035168" "4009035268" "4009035368" "4009835178"
## [106] "4009835228" "4009835268"
colnames(mDF0) <- c("Species", samps)
rownames(mDF0) <- mDF0$Species
#Final edits to table
mDF <- mDF0 %>%
select(-Species) %>%
t() %>%
as_tibble() %>%
mutate_all(funs(./100)) %>%
mutate(SampleID=samps) %>%
select(SampleID, everything())
mDF[1:6,1:5]
## # A tibble: 6 × 5
## SampleID Actinobaculum_m… Actinobaculum_s… Actinobaculum_u… Actinomyces_eur…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1000801248 0 0 0 0
## 2 1000801318 0 0 0 0
## 3 1000801368 0 0 0 0
## 4 1001301158 0 0 0 0
## 5 1001301248 0 0 0 0
## 6 1001301318 0 0 0 0
Get V4 Variants that appear in these samples and determine corresponding clades using 16S rRNA sequences from
# all sample IDs that would have corresponding ASV data
studyIDs <- sgDF %>%
filter(PaperCohort=="2017",
RNAlater=="No") %>%
.$SampleID
# phyloseq object of data from Callahan et al., 2017 PNAS Replication and Refinement
PS17 <- phyloseq(otu_table(st, taxa_are_rows=FALSE), sample_data(df), tax_table(tax))
#transform_sample_counts(function(x) {x/sum(x)}) #make read counts into relative abundances
has.shotgun <- PS17@sam_data$SampleID %in% studyIDs
PS16S_170 <- PS17 %>%
subset_samples(has.shotgun) #keep only samples of interest in OTU table
S16gard <- PS16S_170 %>%
psmelt %>%
filter(Genus=="Gardnerella",
Abundance>0)
Note: there are 9 variants among the entire sample set for 2017, but only 5 within the shotgun samples samples
Are there any samples in which Gardnerella was not found by 16S?
noGard16S <- setdiff(studyIDs, unique(S16gard$SampleID))
noGard16S
## [1] "1062301128" "1062301198"
PS16S_170 %>%
psmelt %>%
filter(Genus=="Gardnerella",
SampleID %in% noGard16S) %>%
select(SampleID, Abundance) %>%
unique
## SampleID Abundance
## 1 1062301128 0
## 2 1062301198 0
Two from subject 10623. Was Gardnerella found by MetaPhlAn2?
mDF %>%
filter(SampleID %in% noGard16S) %>%
select(SampleID, Gardnerella_vaginalis)
## # A tibble: 2 × 2
## SampleID Gardnerella_vaginalis
## <chr> <dbl>
## 1 1062301128 0
## 2 1062301198 0
Gardnerella was not found by MetaPhlAn2 or 16S in these two samples.
Determine which V4 variants are in each reference WGS isolate
# get list of ASVs from 16S amplicon data
vars <- S16gard %>%
select(OTU) %>%
unique %>%
mutate(var=paste0("G", c(1:nrow(.)))) %>%
dplyr::rename(seq="OTU") %>%
select(var, seq)
# Determine which clades each V4 ASV maps to in Gardnerella WGS reference sequences
varClades <- rRNAs16S %>%
as.data.frame() %>%
rownames_to_column("Strain") %>%
dplyr::rename(seq=x) %>%
mutate(Strain=str_extract(Strain, ".*(?=_[0-9]{5} 16S ribosomal RNA$)"),
seq=str_extract(seq, paste(vars$seq, collapse = "|"))) %>%
filter(!is.na(seq)) %>%
left_join(refDF, by="Strain") %>%
left_join(vars, by="seq") %>%
select(var, Clade) %>%
unique
varClades
## var Clade
## 1 G1 C2
## 3 G2 C3
## 4 G1 C1
## 6 G3 C1
## 10 G2 C4
## 21 G4 C5
## 27 G3 C2
## 34 G5 C6
## 48 G3 C3
NOTE: THESE LABELS DO NOT MATCH Callahan 2017 G1/G2 ARE FLIPPED
Get abundances of variants in each sample that has 16S and shotgun sequencing and format tables
vars0 <- vars %>%
spread(var, seq)
S16gardDF <- S16gard %>%
mutate(Var=case_when(OTU==vars0$G1~"G2: Clades 1/2", #change G1/G2 to match Callahan 2017
OTU==vars0$G2~"G1: Clades 3/4",
OTU==vars0$G3~"G3: indetereminant",
OTU==vars0$G4~"G4: Clade 5",
OTU==vars0$G5~"G5: Clade 6")) %>%
group_by(SampleID) %>%
mutate(propGard=Abundance/sum(Abundance)) %>%
ungroup() %>%
select(OTU, SampleID, SubjectID, Abundance, propGard, GWcoll, Var) #RNAlater
S16gardDF$Var <- factor(S16gardDF$Var, levels = c("G2: Clades 1/2", "G1: Clades 3/4", "G3: indetereminant", "G4: Clade 5", "G5: Clade 6")) #Keep order G2, G1, G3, G4, G5
Place clade, genomospecies, and V4 variant abundances into one dataframe
S16gardDF0 <- S16gardDF %>%
select(SampleID, Var, propGard) %>%
mutate(Method="V4 region 16S amplicon")
sgCladeDF0 <- sgCladeDF %>%
filter(SampleID %in% studyIDs) %>%
dplyr::rename(Var=Clade,
propGard=propClade) %>%
mutate(Var=case_when(Var=="C1"~"Clade 1",
Var=="C2"~"Clade 2",
Var=="C3"~"Clade 3",
Var=="C4"~"Clade 4",
Var=="C5"~"Clade 5",
Var=="C6"~"Clade 6"),
Method="Shotgun Clade")
sgGenomoDF0 <- sgGenomoDF %>%
filter(SampleID %in% studyIDs) %>%
dplyr::rename(Var=Genomospecies,
propGard=propGenomospecies) %>%
mutate(Var=as.character(Var),
Var=case_when(Var=="GS1"~"Genomospecies 1, G. vaginalis",
Var=="GS2"~"Genomospecies 2",
Var=="GS3"~"Genomospecies 3",
Var=="GS4"~"Genomospecies 4, G. piotti",
Var=="GS5"~"Genomospecies 5, G. leopoldii",
Var=="GS6"~"Genomospecies 6, G. swidsinskii",
Var=="GS7"~"Genomospecies 7",
Var=="GS8"~"Genomospecies 8",
Var=="GS9"~"Genomospecies 9",
Var=="GS10"~"Genomospecies 10",
Var=="GS11"~"Genomospecies 11",
Var=="GS12"~"Genomospecies 12",
Var=="GS13"~"Genomospecies 13",
Var=="GS14"~"Genomospecies 14"),
Method="Shotgun Genomospecies")
gardAllVarDF <- S16gardDF0 %>%
full_join(sgCladeDF0, by=c("SampleID", "Var", "propGard", "Method")) %>%
full_join(sgGenomoDF0) %>%
left_join(sgDF[,c("SampleID", "SubjectID", "GWcoll")], by="SampleID") %>%
mutate(Var=factor(Var, levels = c("Clade 1", "Clade 2", "Clade 3", "Clade 4", "Clade 5", "Clade 6",
"Genomospecies 1, G. vaginalis", "Genomospecies 2", "Genomospecies 3", "Genomospecies 4, G. piotti", "Genomospecies 11", "Genomospecies 5, G. leopoldii", "Genomospecies 6, G. swidsinskii", "Genomospecies 7", "Genomospecies 8", "Genomospecies 9", "Genomospecies 10", "Genomospecies 12", "Genomospecies 13", "Genomospecies 14",
"G2: Clades 1/2", "G1: Clades 3/4", "G3: indetereminant", "G4: Clade 5", "G5: Clade 6")))
head(gardAllVarDF)
## # A tibble: 6 × 7
## SampleID Var propGard Method n SubjectID GWcoll
## <chr> <fct> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1062101108 G2: Clades 1/2 1 V4 region 16S ampli… NA 10621 10
## 2 1062101338 G2: Clades 1/2 1 V4 region 16S ampli… NA 10621 33
## 3 1062101238 G2: Clades 1/2 1 V4 region 16S ampli… NA 10621 23.1
## 4 1902401208 G1: Clades 3/4 0.923 V4 region 16S ampli… NA 19024 20.3
## 5 1902401308 G1: Clades 3/4 0.816 V4 region 16S ampli… NA 19024 30.4
## 6 1005601168 G2: Clades 1/2 1 V4 region 16S ampli… NA 10056 16.6
unique(gardAllVarDF$Var)
## [1] G2: Clades 1/2 G1: Clades 3/4
## [3] G3: indetereminant G4: Clade 5
## [5] G5: Clade 6 Clade 1
## [7] Clade 2 Clade 4
## [9] Clade 5 Clade 3
## [11] Clade 6 Genomospecies 1, G. vaginalis
## [13] Genomospecies 2 Genomospecies 3
## [15] Genomospecies 4, G. piotti Genomospecies 5, G. leopoldii
## [17] Genomospecies 12 Genomospecies 10
## [19] Genomospecies 11 Genomospecies 13
## [21] Genomospecies 14 Genomospecies 6, G. swidsinskii
## [23] Genomospecies 7 Genomospecies 8
## [25] Genomospecies 9
## 25 Levels: Clade 1 Clade 2 Clade 3 Clade 4 Clade 5 ... G5: Clade 6
First pass assignmentby mapping reads to variants without imposing any read count thresholds.
#fig.width=5, fig.height=4
gardAllVarDF %>%
ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotAll +
facet_wrap(~SubjectID, nrow=2) +
labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
xlim(0,40)
Shotgun variants only with N reads on y-axis
gardAllVarDF %>%
filter(str_detect(Method, "Shotgun")) %>%
ggplot(aes(x=GWcoll, y=n, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotShotgunOnly +
scale_y_log10() +
facet_wrap(~SubjectID, nrow=2) +
labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
xlim(0,40)
Determine how many reads must map to each variant to count the variant as present. View variants over ranges of thresholds:
#fig.height=6, fig.width=9
# create data frames
thresholds <- c(1:49) %>% # set the threshold range
map(~filter(gardAllVarDF, n>.x)) %>% # filter out clades below read threshold
map(~group_by(.x, SampleID, Method)) %>%
map(~mutate(.x, propGard=propGard/sum(propGard))) %>% # recalculate proportions after threshold filtering
map(ungroup) %>%
map2(c(1:49), ~mutate(.x, Threshold=.y)) %>% # add threshold column
purrr::reduce(full_join, by=c(colnames(gardAllVarDF), "Threshold")) # reduce to one data frame
# add threshold information to data frame
gardAllVarDFThresholds <- gardAllVarDF %>%
mutate(Threshold=0) %>% # set zero threshold for baseline
rbind(thresholds) %>%
mutate(Threshold=Threshold+1) # make threshold as >=n
# plot - thresholds 1 to 50
gardAllVarDFThresholds %>%
ggplot(aes(x=Threshold, y=propGard, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotAll +
ylim(0,1) +
scale_x_continuous(breaks = seq(0,50, 5)) +
facet_wrap(~SampleID) +
#guides(color=guide_legend(override.aes=list(shape=c(rep(16, 6), rep(5, 5)), size=c(rep(2, 6), rep(4, 5))))) +
labs(x="Threshold (reads ≥ n)", y="Proportion of Gardnerella")
# plot - thresholds 1 to 5
gardAllVarDFThresholds %>%
filter(Threshold<=5) %>%
ggplot(aes(x=Threshold, y=propGard, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotAll +
ylim(0,1) +
scale_x_continuous(breaks = seq(0,5, 1)) +
facet_wrap(~SampleID) +
#guides(color=guide_legend(override.aes=list(shape=c(rep(16, 6), rep(5, 5)), size=c(rep(2, 6), rep(4, 5))))) +
labs(x="Threshold (reads ≥ n)", y="Proportion of Gardnerella")
View count of variants over ranges of thresholds
#fig.height=5, fig.width=6
V4Vars <- gardAllVarDF %>%
filter(Method=="V4 region 16S amplicon") %>%
select(SampleID, Var, propGard) %>%
group_by(SampleID) %>%
summarise(V4vars=n())
sum(V4Vars$V4vars)
## [1] 56
varCounts <- gardAllVarDFThresholds %>%
filter(Method!="V4 region 16S amplicon") %>%
select(SampleID, Method, Var, propGard, Threshold) %>%
group_by(SampleID, Method, Threshold) %>%
summarise(nVars=n()) %>%
spread(Method, nVars) %>%
ungroup() %>%
full_join(map(1:50, ~mutate(V4Vars, Threshold=.x)) %>%
purrr::reduce(full_join, by = c("SampleID", "V4vars", "Threshold")),
by=c("SampleID", "Threshold")) %>%
replace_na(list(shotgunVars=0)) %>%
gather("Method", "nVars", c(`Shotgun Clade`, `Shotgun Genomospecies`, `V4vars`))
# Thresholds 1:50
varCounts %>%
ggplot(aes(x=Threshold, y=nVars, group=Method, color=Method, linetype=Method)) +
geom_line() +
facet_wrap(~SampleID, scales="free_y") +
scale_color_manual(values=c("#56B4E9", "#009E73", "#D55E00"), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
scale_color_manual(values=c("#56B4E9", "#009E73", "#D55E00"), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
labs(x="Threshold (reads≥n)", y="N Variants", color="Method")
varCounts %>%
ggplot(aes(x=Threshold, y=nVars, group=Method, color=Method, linetype=Method)) +
geom_line() +
facet_wrap(~SampleID, scales = "free_y") +
xlim(1,5) +
scale_linetype_manual(values=c(1,2,3), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
scale_color_manual(values=c("#56B4E9", "#009E73", "#D55E00"), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
labs(x="Threshold (reads≥n)", y="N Variants", color="Method")
Determine the how well variants correspond to each other.
# Samples we are testing
testSamples <- gardAllVarDF %>%
.$SampleID %>%
unique
# Number of V4 variants in these samples
nV4 <- gardAllVarDF %>% # number of v4 variants
filter(Method=="V4 region 16S amplicon") %>%
select(SampleID, Var) %>%
nrow
# Number of Clades found in all samples at each threshold up to 5
nClades <- gardAllVarDFThresholds %>%
filter(Method=="Shotgun Clade",
Threshold <= 5) %>%
select(SampleID, Var, Threshold) %>%
group_by(Threshold) %>%
summarise(Total=n())
# Number of Genomospecies found in all samples at each threshold up to 5
nGenomos <- gardAllVarDFThresholds %>%
filter(Method=="Shotgun Genomospecies",
Threshold <= 5) %>%
select(SampleID, Var, Threshold) %>%
group_by(Threshold) %>%
summarise(Total=n())
Sensitivity as percent of V4 variants or clades that are not represented by a clade or genomospecies
# define function to determine number of V4 variants or clades that are not represented by a clade or genomospecies
testSens <- function(testSample, comparison, thresh) {
if(str_detect(comparison, "V4")){
a <- gardAllVarDF %>%
filter(SampleID == testSample,
Method=="V4 region 16S amplicon") %>%
dplyr::rename(V4=Method) %>%
select(Var, V4)
if("G3: indetereminant" %in% a$Var){
a <- a %>%
bind_rows(data.frame(Var=c("G2: Clades 1/2", "G1: Clades 3/4"), V4=c("V4 region 16S amplicon", "V4 region 16S amplicon"))) %>%
unique %>%
filter(Var!="G3: indetereminant")
}
if(comparison=="V4_clade"){
b <- gardAllVarDFThresholds %>%
filter(SampleID == testSample,
Threshold == thresh,
Method=="Shotgun Clade") %>%
mutate(Var=case_when(Var=="Clade 1"~"G2: Clades 1/2",
Var=="Clade 2"~"G2: Clades 1/2",
Var=="Clade 3"~"G1: Clades 3/4",
Var=="Clade 4"~"G1: Clades 3/4",
Var=="Clade 5"~"G4: Clade 5",
Var=="Clade 6"~"G5: Clade 6"),
Shotgun=Method) %>%
select(Var, Shotgun) %>%
unique
}
if(comparison=="V4_genomospecies"){
b <- gardAllVarDFThresholds %>%
filter(SampleID == testSample,
Threshold == thresh,
Method=="Shotgun Genomospecies") %>%
mutate(Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"G2: Clades 1/2",
Var=="Genomospecies 2"~"G2: Clades 1/2",
Var=="Genomospecies 3"~"G2: Clades 1/2",
Var=="Genomospecies 4, G. piotti"~"G2: Clades 1/2",
Var=="Genomospecies 11"~"G2: Clades 1/2",
Var=="Genomospecies 5, G. leopoldii"~"G1: Clades 3/4",
Var=="Genomospecies 6, G. swidsinskii"~"G1: Clades 3/4",
Var=="Genomospecies 7"~"G1: Clades 3/4",
Var=="Genomospecies 8"~"G1: Clades 3/4",
Var=="Genomospecies 9"~"G1: Clades 3/4",
Var=="Genomospecies 10"~"G1: Clades 3/4",
Var=="Genomospecies 14"~"G1: Clades 3/4",
Var=="Genomospecies 12"~"G4: Clade 5",
Var=="Genomospecies 13"~"G5: Clade 6"),
Shotgun=Method) %>%
select(Var, Shotgun) %>%
unique}
c <- a %>%
full_join(b, by="Var") %>%
group_by(Shotgun) %>%
tally %>%
filter(is.na(Shotgun)) %>%
.$n
}
if(comparison=="clade_genomospecies"){
a <- gardAllVarDFThresholds %>%
filter(SampleID == testSample,
Threshold == thresh,
Method =="Shotgun Clade") %>%
dplyr::rename(Clade=Method) %>%
select(Var, Clade)
b <- gardAllVarDFThresholds %>%
filter(SampleID == testSample,
Threshold == thresh,
Method=="Shotgun Genomospecies") %>%
mutate(Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"Clade 1",
Var=="Genomospecies 2"~"Clade 1",
Var=="Genomospecies 3"~"Clade 2",
Var=="Genomospecies 4, G. piotti"~"Clade 2",
Var=="Genomospecies 11"~"Clade 2",
Var=="Genomospecies 5, G. leopoldii"~"Clade 4",
Var=="Genomospecies 6, G. swidsinskii"~"Clade 4",
Var=="Genomospecies 7"~"Clade 3",
Var=="Genomospecies 8"~"Clade 3",
Var=="Genomospecies 9"~"Clade 3",
Var=="Genomospecies 10"~"Clade 3",
Var=="Genomospecies 14"~"Clade 3",
Var=="Genomospecies 12"~"Clade 5",
Var=="Genomospecies 13"~"Clade 6"),
Genomospecies=Method) %>%
select(Var, Genomospecies) %>%
unique
c <- a %>%
full_join(b, by="Var") %>%
group_by(Genomospecies) %>%
tally %>%
filter(is.na(Genomospecies)) %>%
.$n
}
return(c)
}
data.frame(testSample=rep(testSamples, 5),# set variables for testSens function
comparison=rep("V4_clade", 135),
thresh=rep(c(1:5), 27)) %>%
bind_cols(z=pmap(., testSens) %>% #run testSens to find number of missing variables for each test
modify_if(~length(.)==0, ~0) %>%
flatten_dbl) %>%
group_by(thresh) %>%
summarize(nMissing=sum(z)) %>% # count N missing at each threshold
dplyr::rename(Threshold=thresh) %>% # format results table
mutate(Total = nV4,
`Percent Missing`=round(nMissing/Total*100, 1)) %>%
formattable
| Threshold | nMissing | Total | Percent Missing |
|---|---|---|---|
| 1 | 5 | 56 | 8.9 |
| 2 | 5 | 56 | 8.9 |
| 3 | 5 | 56 | 8.9 |
| 4 | 6 | 56 | 10.7 |
| 5 | 6 | 56 | 10.7 |
data.frame(testSample=rep(testSamples, 5), # set variables for testSens function
comparison=rep("V4_genomospecies", 135),
thresh=rep(c(1:5), 27)) %>%
bind_cols(z=pmap(., testSens) %>% # run testSens to find number of missing variants to each test
modify_if(~length(.)==0, ~0) %>%
flatten_dbl) %>%
group_by(thresh) %>%
summarize(nMissing=sum(z)) %>% # count N missing at each threshold
dplyr::rename(Threshold=thresh) %>% # format results table
mutate(Total=nV4,
`Percent Missing`=round(nMissing/Total*100, 1)) %>%
formattable
| Threshold | nMissing | Total | Percent Missing |
|---|---|---|---|
| 1 | 5 | 56 | 8.9 |
| 2 | 5 | 56 | 8.9 |
| 3 | 7 | 56 | 12.5 |
| 4 | 7 | 56 | 12.5 |
| 5 | 8 | 56 | 14.3 |
data.frame(testSample=rep(testSamples, 5), # set variables for testSens function
comparison=rep("clade_genomospecies", 135),
thresh=rep(c(1:5), 27)) %>%
bind_cols(z=pmap(., testSens) %>% # run testSens to find number of missing variants to each test
modify_if(~length(.)==0, ~0) %>%
flatten_dbl) %>%
group_by(thresh) %>%
summarize(nMissing=sum(z)) %>% # count N missing at each threshold
dplyr::rename(Threshold=thresh) %>% # format results table
left_join(nClades, by="Threshold") %>%
mutate(`Percent Missing`=round(nMissing/Total*100, 1)) %>%
formattable
| Threshold | nMissing | Total | Percent Missing |
|---|---|---|---|
| 1 | 8 | 76 | 10.5 |
| 2 | 11 | 72 | 15.3 |
| 3 | 11 | 69 | 15.9 |
| 4 | 10 | 67 | 14.9 |
| 5 | 13 | 67 | 19.4 |
Precision here defined as number of clades or genomospecies that are not supported by a V4 variant
# define function to determine the number of clades or genomospecies that are not supported by a V4 variant
testSpec <- function(testSample, comparison, thresh) {
a <- gardAllVarDF %>%
filter(SampleID == testSample,
Method=="V4 region 16S amplicon") %>%
dplyr::rename(V4=Method) %>%
select(Var, V4)
if("G3: indetereminant" %in% a$Var){
a <- a %>%
bind_rows(data.frame(Var=c("G2: Clades 1/2", "G1: Clades 3/4"), V4=c("V4 region 16S amplicon", "V4 region 16S amplicon"))) %>%
unique %>%
filter(Var!="G3: indetereminant")
}
if(comparison=="V4_clade"){
b <- gardAllVarDFThresholds %>%
filter(SampleID == testSample,
Threshold == thresh,
Method=="Shotgun Clade") %>%
mutate(Var=case_when(Var=="Clade 1"~"G2: Clades 1/2",
Var=="Clade 2"~"G2: Clades 1/2",
Var=="Clade 3"~"G1: Clades 3/4",
Var=="Clade 4"~"G1: Clades 3/4",
Var=="Clade 5"~"G4: Clade 5",
Var=="Clade 6"~"G5: Clade 6"),
Shotgun=Method) %>%
select(Var, Shotgun) %>%
unique
}
if(comparison=="V4_genomospecies"){
b <- gardAllVarDFThresholds %>%
filter(SampleID == testSample,
Threshold == thresh,
Method=="Shotgun Genomospecies") %>%
mutate(Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"G2: Clades 1/2",
Var=="Genomospecies 2"~"G2: Clades 1/2",
Var=="Genomospecies 3"~"G2: Clades 1/2",
Var=="Genomospecies 4, G. piotti"~"G2: Clades 1/2",
Var=="Genomospecies 11"~"G2: Clades 1/2",
Var=="Genomospecies 5, G. leopoldii"~"G1: Clades 3/4",
Var=="Genomospecies 6, G. swidsinskii"~"G1: Clades 3/4",
Var=="Genomospecies 7"~"G1: Clades 3/4",
Var=="Genomospecies 8"~"G1: Clades 3/4",
Var=="Genomospecies 9"~"G1: Clades 3/4",
Var=="Genomospecies 10"~"G1: Clades 3/4",
Var=="Genomospecies 14"~"G1: Clades 3/4",
Var=="Genomospecies 12"~"G4: Clade 5",
Var=="Genomospecies 13"~"G5: Clade 6"),
Shotgun=Method) %>%
select(Var, Shotgun) %>%
unique
}
c <- a %>%
full_join(b, by="Var") %>%
group_by(V4) %>%
tally %>%
filter(is.na(V4)) %>%
.$n
return(c)}
data.frame(testSample=rep(testSamples, 5), # set variables for testSpec function
comparison=rep("V4_clade", 135),
thresh=rep(c(1:5), 27)) %>%
bind_cols(z=pmap(., testSpec) %>% # run testSpec to find number of missing variants to each test
modify_if(~length(.)==0, ~0) %>%
flatten_dbl) %>%
group_by(thresh) %>%
summarize(nUnsupported=sum(z)) %>% # count N unsupported variants at each threshold
dplyr::rename(Threshold=thresh) %>% # format results table
left_join(nClades, by="Threshold") %>%
mutate(`Percent Unsupported`=round(nUnsupported/Total*100, 1)) %>%
formattable
| Threshold | nUnsupported | Total | Percent Unsupported |
|---|---|---|---|
| 1 | 4 | 76 | 5.3 |
| 2 | 2 | 72 | 2.8 |
| 3 | 2 | 69 | 2.9 |
| 4 | 2 | 67 | 3.0 |
| 5 | 2 | 67 | 3.0 |
data.frame(testSample=rep(testSamples, 5), # set variables for testSpec function
comparison=rep("V4_genomospecies", 135),
thresh=rep(c(1:5), 27)) %>%
bind_cols(z=pmap(., testSpec) %>% # run testSpec to find number of missing variants at each test
modify_if(~length(.)==0, ~0) %>%
flatten_dbl) %>%
group_by(thresh) %>%
summarize(nUnsupported=sum(z)) %>% # count N unsupported variants at each threshold
dplyr::rename(Threshold=thresh) %>% # format results table
left_join(nGenomos, by="Threshold") %>%
mutate(`Percent Unsupported`=round(nUnsupported/Total*100, 1)) %>%
formattable
| Threshold | nUnsupported | Total | Percent Unsupported |
|---|---|---|---|
| 1 | 4 | 123 | 3.3 |
| 2 | 2 | 107 | 1.9 |
| 3 | 2 | 101 | 2.0 |
| 4 | 1 | 98 | 1.0 |
| 5 | 1 | 91 | 1.1 |
Choose a threshold of ≥2 reads for present. Ignore all one-read assignments
gardAllVarDF_withThreshold <- gardAllVarDFThresholds %>%
filter(str_detect(Method, "Shotgun")&Threshold==2|str_detect(Method, "V4")&Threshold==1) %>%
select(-Threshold)
V4 variants found
gardAllVarDF_withThreshold %>%
filter(Method=="V4 region 16S amplicon") %>%
count(SampleID) %>%
summarize(min=min(n), max=max(n), mean=mean(n), total=sum(n))
## # A tibble: 1 × 4
## min max mean total
## <int> <int> <dbl> <int>
## 1 1 5 2.15 56
Clades found
gardAllVarDF_withThreshold %>%
filter(Method=="Shotgun Clade") %>%
count(SampleID) %>%
summarize(min=min(n), max=max(n), mean=mean(n), total=sum(n))
## # A tibble: 1 × 4
## min max mean total
## <int> <int> <dbl> <int>
## 1 1 6 3.13 72
Genomospecies found
gardAllVarDF_withThreshold %>%
filter(Method=="Shotgun Genomospecies") %>%
count(SampleID) %>%
summarize(min=min(n), max=max(n), mean=mean(n), total=sum(n))
## # A tibble: 1 × 4
## min max mean total
## <int> <int> <dbl> <int>
## 1 1 13 4.65 107
gardAllVarDF_withThreshold %>%
ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotAll +
facet_wrap(~SubjectID, nrow=2) +
labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
xlim(0,40)
gardAllVarDF_withThreshold %>%
filter(Method!="Shotgun Genomospecies") %>%
ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotCladeV4 +
facet_wrap(~SubjectID, nrow=2) +
labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
xlim(0,40)
gardAllVarDF_withThreshold %>%
filter(Method!="V4 region 16S amplicon") %>%
ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
geom_point() +
ggplotShotgunOnly +
facet_wrap(~SubjectID, nrow=2) +
labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
guides(color=guide_legend(override.aes=list(shape=15, size=3), ncol=2)) +
xlim(0,40)
# samples with variant G3
g3Samples <- gardAllVarDF_withThreshold %>%
filter(Var=="G3: indetereminant") %>%
.$SampleID
G3_50plus <- gardAllVarDF_withThreshold %>%
filter(Var=="G3: indetereminant") %>%
mutate(propGard=round(propGard, 2)) %>%
filter(propGard>=.50)
G3_50plus
## # A tibble: 2 × 7
## SampleID Var propGard Method n SubjectID GWcoll
## <chr> <fct> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1062701128 G3: indetereminant 0.65 V4 region 16S a… NA 10627 12
## 2 2050501168 G3: indetereminant 0.5 V4 region 16S a… NA 20505 16.4
Sample 1062701128 has 65% G3, and sample 2050501168 has 50% G3.
cladeVs16SPlot <- gardAllVarDF_withThreshold %>%
filter(Method!="Shotgun Genomospecies") %>%
mutate(Var=as.character(Var),
Var=case_when(Var=="Clade 1"~"G2: Clades 1/2",
Var=="Clade 2"~"G2: Clades 1/2",
Var=="Clade 3"~"G1: Clades 3/4",
Var=="Clade 4"~"G1: Clades 3/4",
Var=="Clade 5"~"G4: Clade 5",
Var=="Clade 6"~"G5: Clade 6",
str_detect(Var, "G")~Var)) %>%
group_by(SampleID, Method, Var) %>%
summarise(propGard=sum(propGard)) %>%
ungroup %>%
spread(Method, propGard) %>%
replace_na(list(`V4 region 16S amplicon`=0,
`Shotgun Clade`=0)) %>%
mutate(`≥50% G3`=SampleID %in% G3_50plus$SampleID) %>%
filter(Var!="G3: indetereminant") %>%
ggplot(aes(x=`V4 region 16S amplicon`, y=`Shotgun Clade`)) +
geom_abline(aes(intercept=0, slope=1), color="gray", linetype=2) +
geom_smooth(method="lm", se=FALSE, color="black", linetype=1) +
stat_cor(method = "pearson", alternative = "two.sided", cor.coef.name = c("R"), p.digits = 3, label.y = 0.875, label.x = 0) +
geom_point(aes(color=Var, shape=`≥50% G3`), alpha=0.6, size=2) +
scale_color_manual(values=c("#b22222", "#214fc6", "#E69F00", "#009E73")) +
theme(legend.position = c(0.6, 0.25),
legend.direction = "vertical",
legend.text = element_text(size=7),
legend.title = element_text(size=8),
legend.background = element_blank()) +
guides(shape="none") +
labs(color="V4 Variant")
cladeVs16SPlot
gsVs16SPlot <- gardAllVarDF_withThreshold %>%
filter(Method!="Shotgun Clade") %>%
mutate(Var=as.character(Var),
Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"G2: Clades 1/2",
Var=="Genomospecies 2"~"G2: Clades 1/2",
Var=="Genomospecies 3"~"G2: Clades 1/2",
Var=="Genomospecies 4, G. piotti"~"G2: Clades 1/2",
Var=="Genomospecies 11"~"G2: Clades 1/2",
Var=="Genomospecies 5, G. leopoldii"~"G1: Clades 3/4",
Var=="Genomospecies 6, G. swidsinskii"~"G1: Clades 3/4",
Var=="Genomospecies 7"~"G1: Clades 3/4",
Var=="Genomospecies 8"~"G1: Clades 3/4",
Var=="Genomospecies 9"~"G1: Clades 3/4",
Var=="Genomospecies 10"~"G1: Clades 3/4",
Var=="Genomospecies 14"~"G1: Clades 3/4",
Var=="Genomospecies 12"~"G4: Clade 5",
Var=="Genomospecies 13"~"G5: Clade 6",
str_detect(Var, "G")~Var),
Var=case_when(Var=="G2: Clades 1/2"~"G2: Genomospeces 1/2/3/4/11",
Var=="G1: Clades 3/4"~"G1: Genomospeces 5/6/7/8/9/10/14",
Var=="G4: Clade 5"~"G4: Genomospecies 12",
Var=="G5: Clade 6"~"G5: Genomospecies 13")) %>%
group_by(SampleID, Method, Var) %>%
summarise(propGard=sum(propGard)) %>%
ungroup %>%
spread(Method, propGard) %>%
replace_na(list(`V4 region 16S amplicon`=0,
`Shotgun Genomospecies`=0)) %>%
mutate(`≥50% G3`=SampleID %in% G3_50plus$SampleID) %>%
filter(Var!="G3: indetereminant") %>%
ggplot(aes(x=`V4 region 16S amplicon`, y=`Shotgun Genomospecies`)) +
geom_abline(aes(intercept=0, slope=1), color="gray", linetype=2) +
geom_smooth(method="lm", se=FALSE, color="black", linetype=1) +
stat_cor(method = "pearson", alternative = "two.sided", cor.coef.name = c("R"), p.digits = 3, label.y = 0.875, label.x = 0) +
geom_point(aes(color=Var, shape=`≥50% G3`), alpha=0.6, size=2) +
scale_color_manual(values=c("#b22222", "#214fc6", "#E69F00", "#009E73")) +
theme(legend.position = c(0.675, 0.25),
legend.direction = "vertical",
legend.text = element_text(size=7),
legend.title = element_text(size=8),
legend.background = element_blank()) +
guides(shape="none") +
labs(color="V4 Variant")
gsVs16SPlot
gardAllVarDF_withThreshold %>%
filter(Method=="Shotgun Clade") %>%
.$Var %>%
unique
## [1] Clade 1 Clade 2 Clade 4 Clade 3 Clade 5 Clade 6
## 25 Levels: Clade 1 Clade 2 Clade 3 Clade 4 Clade 5 ... G5: Clade 6
cladeVsGsPlot <- gardAllVarDF_withThreshold %>%
filter(Method!="V4 region 16S amplicon") %>%
mutate(Var=as.character(Var),
Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"Clade 1",
Var=="Genomospecies 2"~"Clade 1",
Var=="Genomospecies 3"~"Clade 2",
Var=="Genomospecies 4, G. piotti"~"Clade 2",
Var=="Genomospecies 11"~"Clade 2",
Var=="Genomospecies 5, G. leopoldii"~"Clade 4",
Var=="Genomospecies 6, G. swidsinskii"~"Clade 4",
Var=="Genomospecies 7"~"Clade 3",
Var=="Genomospecies 8"~"Clade 3",
Var=="Genomospecies 9"~"Clade 3",
Var=="Genomospecies 10"~"Clade 3",
Var=="Genomospecies 14"~"Clade 3",
Var=="Genomospecies 12"~"Clade 5",
Var=="Genomospecies 13"~"Clade 6",
str_detect(Var, "Clade")~Var),
Var=case_when(Var=="Clade 1"~"Clade 1: Genomospeces 1/2",
Var=="Clade 2"~"Clade 2: Genomospeces 3/4/11",
Var=="Clade 3"~"Clade 3: Genomospecies 7/8/9/10/14",
Var=="Clade 4"~"Clade 4: Genomospecies 5/6",
Var=="Clade 5"~"Clade 5: Genomospecies 12",
Var=="Clade 6"~"Clade 6: Genomospecies 13")) %>%
group_by(SampleID, Method, Var) %>%
summarise(propGard=sum(propGard)) %>%
ungroup %>%
spread(Method, propGard) %>%
replace_na(list(`Shotgun Clade`=0,
`Shotgun Genomospecies`=0)) %>%
ggplot(aes(x=`Shotgun Clade`, y=`Shotgun Genomospecies`)) +
geom_abline(aes(intercept=0, slope=1), color="gray", linetype=2) +
geom_smooth(method="lm", se=FALSE, color="black", linetype=1) +
stat_cor(method = "pearson", alternative = "two.sided", cor.coef.name = c("R"), p.digits = 3, label.y = 0.875, label.x = 0) +
geom_point(aes(color=Var), alpha=0.6, size=2) +
scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
theme(legend.position = c(0.71, 0.22),
legend.direction = "vertical",
legend.text = element_text(size=7),
legend.title = element_text(size=8),
legend.background = element_blank()) +
labs(color="Clade")
cladeVsGsPlot
#fig.width=6, fig.height=2
plot_grid(plotlist=list(cladeVs16SPlot, gsVs16SPlot, cladeVsGsPlot), nrow = 1, labels=c("A", "B", "C"), label_size = 15)
#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "figure2ABC_mappingvs16S.png", sep="_")))
In subject 10608: No Gardnerella was detected. Shotgun sequencing can demonstrate less sensitivity than amplicon sequencing.
Subject 10608
#gardnerella found by metaphlan in subject 10608
gardAllVarDF %>%
filter(SubjectID==10608) %>%
select(SampleID, Var, propGard, Method)
## # A tibble: 3 × 4
## SampleID Var propGard Method
## <chr> <fct> <dbl> <chr>
## 1 1060801198 G1: Clades 3/4 0.713 V4 region 16S amplicon
## 2 1060801198 G4: Clade 5 0.287 V4 region 16S amplicon
## 3 1060801338 G1: Clades 3/4 1 V4 region 16S amplicon
samples10608 <- gardAllVarDF %>%
filter(SubjectID==10608) %>%
.$SampleID %>%
unique
samples10608
## [1] "1060801198" "1060801338"
#percent Gardnerella in these samples
mDF %>%
select(SampleID, Gardnerella_vaginalis) %>%
filter(SampleID %in% samples10608)
## # A tibble: 2 × 2
## SampleID Gardnerella_vaginalis
## <chr> <dbl>
## 1 1060801198 0
## 2 1060801338 0
#read depth of these samples
sgDF %>%
filter(SampleID %in% samples10608) %>%
select(SampleID, TotalPairs, bbmapFiltered_pairs, pctHuman)
## # A tibble: 2 × 4
## SampleID TotalPairs bbmapFiltered_pairs pctHuman
## <chr> <dbl> <dbl> <dbl>
## 1 1060801198 14655421 258847 98.0
## 2 1060801338 15223063 195118 98.5
ggplot(data=sgDF) +
geom_density(aes(x=TotalPairs)) +
geom_vline(xintercept = sgDF$TotalPairs[sgDF$SampleID==samples10608[1]], color="red") +
geom_vline(xintercept = sgDF$TotalPairs[sgDF$SampleID==samples10608[2]], color="blue")
ggplot(data=sgDF) +
geom_density(aes(x=bbmapFiltered_pairs)) +
geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID==samples10608[1]], color="red") +
geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID==samples10608[2]], color="blue")
Sample 1050801318
#No gardnerella found by metaphlan in sample 1050801318
gardAllVarDF %>%
filter(SubjectID=="10508") %>%
select(SampleID, Var, propGard, Method)
## # A tibble: 8 × 4
## SampleID Var propGard Method
## <chr> <fct> <dbl> <chr>
## 1 1050801308 G1: Clades 3/4 0.799 V4 region 16S amplicon
## 2 1050801308 G3: indetereminant 0.201 V4 region 16S amplicon
## 3 1050801318 G2: Clades 1/2 1 V4 region 16S amplicon
## 4 1050801308 Clade 2 0.231 Shotgun Clade
## 5 1050801308 Clade 4 0.769 Shotgun Clade
## 6 1050801308 Genomospecies 3 0.286 Shotgun Genomospecies
## 7 1050801308 Genomospecies 4, G. piotti 0.143 Shotgun Genomospecies
## 8 1050801308 Genomospecies 5, G. leopoldii 0.571 Shotgun Genomospecies
#read depth of all sub samples
sgDF %>%
filter(SampleID=="1050801318") %>%
select(SampleID, TotalPairs, bbmapFiltered_pairs, pctHuman)
## # A tibble: 1 × 4
## SampleID TotalPairs bbmapFiltered_pairs pctHuman
## <chr> <dbl> <dbl> <dbl>
## 1 1050801318 18029410 494084 96.8
#Bacterial reads in all subject 10508 samples
ggplot(data=sgDF) +
geom_density(aes(x=bbmapFiltered_pairs)) +
geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID=="1050801318"], color="blue", show.legend = TRUE) +
geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID=="1050801308"], color="red", show.legend = TRUE) +
geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID=="1050835178"], color="green", show.legend = TRUE)
Assess uncharacterized Gardnerella when measuring clades:
sgCladeDF %>%
filter(n>=2) %>%
group_by(SampleID) %>%
summarise(n=sum(n)) %>%
full_join(mDF[,c("SampleID", "Gardnerella_vaginalis")], by="SampleID") %>%
filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 0 × 3
## # … with 3 variables: SampleID <chr>, n <dbl>, Gardnerella_vaginalis <dbl>
No uncharacterized Gardnerella when measuring clades.
Assess uncharacterized Gardnerella when measuring genomospecies:
sgGenomoDF %>%
filter(n>=2) %>%
group_by(SampleID) %>%
summarise(n=sum(n)) %>%
full_join(mDF[,c("SampleID", "Gardnerella_vaginalis")], by="SampleID") %>%
filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 1 × 3
## SampleID n Gardnerella_vaginalis
## <chr> <dbl> <dbl>
## 1 1061235278 NA 0.0152
One sample with uncharacterized Gardnerella by genomospecies.
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] cowplot_1.1.1 ggpubr_0.4.0 formattable_0.2.1
## [4] Biostrings_2.60.2 GenomeInfoDb_1.28.4 XVector_0.32.0
## [7] IRanges_2.26.0 S4Vectors_0.30.2 BiocGenerics_0.38.0
## [10] phyloseq_1.36.0 forcats_0.5.1 stringr_1.4.0
## [13] dplyr_1.0.7 purrr_0.3.4 readr_2.0.2
## [16] tidyr_1.1.4 tibble_3.1.5 ggplot2_3.3.5
## [19] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ggsignif_0.6.3 ellipsis_0.3.2
## [4] fs_1.5.0 rstudioapi_0.13 farver_2.1.0
## [7] bit64_4.0.5 fansi_0.5.0 lubridate_1.8.0
## [10] xml2_1.3.2 codetools_0.2-18 splines_4.1.1
## [13] knitr_1.36 ade4_1.7-18 jsonlite_1.7.2
## [16] broom_0.7.9 cluster_2.1.2 dbplyr_2.1.1
## [19] compiler_4.1.1 httr_1.4.2 backports_1.2.1
## [22] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0
## [25] cli_3.4.1 htmltools_0.5.2 tools_4.1.1
## [28] igraph_1.2.6 gtable_0.3.0 glue_1.4.2
## [31] GenomeInfoDbData_1.2.6 reshape2_1.4.4 Rcpp_1.0.7
## [34] carData_3.0-5 Biobase_2.52.0 cellranger_1.1.0
## [37] jquerylib_0.1.4 vctrs_0.5.1 rhdf5filters_1.4.0
## [40] multtest_2.48.0 ape_5.5 nlme_3.1-153
## [43] iterators_1.0.13 xfun_0.26 rvest_1.0.1
## [46] lifecycle_1.0.3 rstatix_0.7.0 zlibbioc_1.38.0
## [49] MASS_7.3-54 scales_1.1.1 vroom_1.5.5
## [52] hms_1.1.1 biomformat_1.20.0 rhdf5_2.36.0
## [55] yaml_2.2.1 sass_0.4.0 stringi_1.7.8
## [58] highr_0.9 foreach_1.5.1 permute_0.9-5
## [61] rlang_1.0.6 pkgconfig_2.0.3 bitops_1.0-7
## [64] evaluate_0.14 lattice_0.20-45 Rhdf5lib_1.14.2
## [67] htmlwidgets_1.5.4 labeling_0.4.2 bit_4.0.4
## [70] tidyselect_1.1.1 plyr_1.8.6 magrittr_2.0.1
## [73] R6_2.5.1 generics_0.1.0 DBI_1.1.1
## [76] pillar_1.6.3 haven_2.4.3 withr_2.4.2
## [79] mgcv_1.8-38 survival_3.2-13 abind_1.4-5
## [82] RCurl_1.98-1.5 modelr_0.1.8 crayon_1.4.1
## [85] car_3.0-12 utf8_1.2.2 tzdb_0.1.2
## [88] rmarkdown_2.11 grid_4.1.1 readxl_1.3.1
## [91] data.table_1.14.2 vegan_2.5-7 reprex_2.0.1
## [94] digest_0.6.28 munsell_0.5.0 bslib_0.3.1